library(ggplot2)
library(magrittr)
library(dplyr)
library(ggthemes)
library(ggpubr)
library(tidyverse)
library(plotly)
library(DT)
library(ggalt)
library(ggrepel)
library(rvest)
library(stringr)
library(plotly)
athletes<-read.csv("data/athletes_and_events.csv")
gdp_pop<-read.csv("data/gdp_pop.csv")
noc_regions<-read.csv("data/noc_regions.csv")
I am choosing to combined all German designations under German and all Russian designation under Russia. All other countries which have changed names over time, I left as is.
athletes$NOC[athletes$NOC %in% c("SAA","EUA","GDR","FRG")] <- "GER"
athletes$NOC[athletes$NOC %in% c("RUS","RU1","URS","EUN",'ROC')] <- "RUS"
colnames(gdp_pop)[colnames(gdp_pop) == 'Code'] <- 'NOC'
all_data <- athletes %>%
left_join(noc_regions, by = 'NOC') %>%
left_join(gdp_pop,by='NOC')%>%
filter(Season=="Winter")
all_data %>%
filter(Season == "Winter") %>%
group_by(region) %>%
summarise(Year_Count = n_distinct(Year)) %>%
arrange(desc(Year_Count)) %>%
head(15)
## # A tibble: 15 × 2
## region Year_Count
## <chr> <int>
## 1 Austria 22
## 2 Canada 22
## 3 Czech Republic 22
## 4 Finland 22
## 5 France 22
## 6 Hungary 22
## 7 Italy 22
## 8 Norway 22
## 9 Poland 22
## 10 Sweden 22
## 11 Switzerland 22
## 12 UK 22
## 13 USA 22
## 14 Belgium 20
## 15 Germany 20
top10 <- all_data %>%
filter(Season == "Winter") %>%
group_by(Country) %>%
summarise(count = n_distinct(Sport,Event,Sex,Medal)) %>%
arrange(desc(count))%>%
select(Country) %>%
head(10)
top10_byyear<- all_data %>%
filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")
, Country %in% top10$Country)%>%
group_by(Country,Year) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, Country)) %>%
arrange(desc(Medal_Count))
top10_byyear
## # A tibble: 206 × 3
## # Groups: Country [10]
## Country Year Medal_Count
## <chr> <int> <int>
## 1 United States 2010 37
## 2 Germany 2002 36
## 3 United States 2002 34
## 4 Germany 1988 33
## 5 Russia 2014 33
## 6 Germany 2010 30
## 7 Germany 1976 29
## 8 Germany 1998 29
## 9 Germany 2006 29
## 10 Russia 1988 29
## # … with 196 more rows
Here, I graphed this in two different ways. The first is a line graph with the top 10 countries all in one and a second where they are separated in facets.
plot1<-ggplot(top10_byyear,aes(Year,Medal_Count))+
#theme_tufte(base_size = 10)+
geom_point(aes(color=Country))+
geom_line(aes(color=Country))+
scale_color_brewer(palette="Spectral")+
#facet_grid(rows = vars(Country))+
#facet_wrap(~Country, ncol=3)+
labs(x="Year", y="Number of Medals", title="Number of Medals per Year")+
theme(legend.position="bottom")+
theme(plot.title=element_text(hjust = 0.5))
plot1
ggplot(top10_byyear,aes(Year,Medal_Count))+
#theme_tufte(base_size = 10)+
geom_point(aes(), color = "blue")+
geom_smooth(aes(), color = "blue",size = .5)+
facet_grid(rows = vars(Country))+
facet_wrap(~Country, ncol=3)+
labs(x="Year", y="Number of Medals", title="Number of Medals per Year")+
theme(legend.position="none")+
theme(plot.title=element_text(hjust = 0.5))
top10 <- all_data %>%
filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")) %>%
group_by(Country) %>%
summarise(count = n_distinct(Sport,Event,Sex,Medal,Year)) %>%
arrange(desc(count))%>%
select(Country) %>%
head(10)
totals_all_time<- all_data %>%
filter(Season == "Winter", Medal %in% c("Bronze","Silver","Gold")
, Country %in% top10$Country)%>%
group_by(Country,Medal) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, Country)) %>%
arrange(desc(Medal_Count))
ggplot(totals_all_time
, aes(x=reorder(Country, desc(Medal_Count), sum), y=Medal_Count, fill=factor(Medal, levels=c("Gold", "Silver", "Bronze")))) +
theme_tufte(base_size = 10) +
labs(fill = "Medal") +
labs(x="Country", y="Number of Medals", title="All Time Medals per Country") +
geom_bar(stat="identity") +
scale_fill_manual(values=c('#D6AF36','#A7A7AD','#A77044'))+
theme(plot.title=element_text(hjust = 0.5))
I would suggest the all-time medal counts by country to my editor. The graph with all of the countries over time is a little bit messy for me. The facets make it a little better, but I still feel like the graph not over time is simpler and better.
For this question, I am choosing to only look at the Gold medals.
totals_all_time<- all_data %>%
filter(Season == "Winter", Medal %in% c("Gold"), not(is.na(GDP.per.Capita)))%>%
group_by(Country,Medal,GDP.per.Capita,Population) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, Country)
, GDP_per_Gold = max(GDP.per.Capita)/n_distinct(Games, Year, Season, Medal, Event, Country)
, Population_per_Gold = max(Population)/n_distinct(Games, Year, Season, Medal, Event, Country)) %>%
arrange((Population_per_Gold))
totals_all_time$normal_rank <- rank(desc(totals_all_time$Medal_Count),ties.method = "min")
totals_all_time$GDP_rank <- rank(totals_all_time$GDP_per_Gold,ties.method = "min")
totals_all_time$Pop_rank <- rank(totals_all_time$Population_per_Gold,ties.method = "min")
pivoted <- totals_all_time %>%
gather(normal_rank:Pop_rank, key = "ranking", value = "Rank") %>%
arrange(Country)
pivoted$order <- case_when(pivoted$ranking =="normal_rank" ~ 1,
pivoted$ranking == "GDP_rank" ~ 2,
pivoted$ranking == "Pop_rank" ~ 3)
#pivoted
Top10Diff <- pivoted %>%
group_by(Country) %>%
summarise(range = max(Rank, na.rm=TRUE) - min(Rank, na.rm=TRUE)) %>%
arrange(desc(range)) %>%
head(10)
pivoted_top10 <-pivoted %>% filter(Country %in% Top10Diff$Country)
pivoted %>% filter(Country %in% Top10Diff$Country) %>%
ggplot(aes(x = reorder(ranking,order), y = Rank, group = Country)) +
theme_minimal()+
geom_line(aes(color = Country), size = 1) +
# ("China",'Croatia','Estonia','India','Estonia','Russia','Slovenia','Switz','Ukraine','US')
# ("Red",'Green','Green','Black','Black','Red','Green','Black','Black','Red')
scale_color_manual(values=c("Red",'forestgreen','forestgreen','Black','Black','Red','forestgreen','Black','Black','Red')) +
geom_point(aes(color = Country), size = 2) +
scale_y_reverse(breaks = 1:nrow(pivoted)) +
#scale_x_discrete(breaks = 1:10) +
theme(legend.position = 'none') +
geom_text(data = pivoted_top10 %>% filter(ranking == "Pop_rank"),
aes(label = Country, x = 3.2),
fontface = "bold", color = "Black", size = 2.5) +
labs(x = 'Ranking Type', y = 'Rank', title = "Powerhouses Fall in Population Per Gold Medal", subtitle = 'Gold Medal Ranking (Normal, Adjusted for GDP, Adjusted for Population)')
The graph above shows the top 10 countries with the largest difference between any of the ranks. It can be seen that some of the powerhouses (US, Russia, and China) all fall in the rankings when they are adjusted for population. I think this is kinda messy. It is good for looking at one country and seeing the different rankings, but it is bad at comparing different countries, so I separated it into two graphs below.
totals_all_time<- all_data %>%
filter(Season == "Winter", Medal %in% c("Gold"), not(is.na(GDP.per.Capita)))%>%
group_by(Country,Medal,GDP.per.Capita,Population) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, Country)
, GDP_per_Gold = max(GDP.per.Capita)/n_distinct(Games, Year, Season, Medal, Event, Country)
, Population_per_Gold = max(Population)/n_distinct(Games, Year, Season, Medal, Event, Country)) %>%
arrange((Population_per_Gold))
totals_all_time$"Normal Rank"<- rank(desc(totals_all_time$Medal_Count),ties.method = "min")
totals_all_time$"GDP Rank"<- rank(totals_all_time$GDP_per_Gold,ties.method = "min")
totals_all_time$"Pop Rank" <- rank(totals_all_time$Population_per_Gold,ties.method = "min")
totals_all_time$"GDP_Better" <- case_when(totals_all_time$"Normal Rank"< totals_all_time$"GDP Rank" ~ "GDP Rank Worse than Normal",
totals_all_time$"Normal Rank"> totals_all_time$"GDP Rank" ~ "GDP Rank Better than Normal",
TRUE ~ "Identical Ranks")
totals_all_time$'Pop_Better' <- case_when(totals_all_time$"Normal Rank"< totals_all_time$"Pop Rank" ~ "Pop Rank Worse than Normal",
totals_all_time$"Normal Rank"> totals_all_time$"Pop Rank" ~ "Pop Rank Better than Normal",
TRUE ~ "Identical Ranks")
ggplot(totals_all_time,aes(x=totals_all_time$"GDP Rank",y=totals_all_time$"Normal Rank"))+
theme_minimal()+
geom_point(aes(color=totals_all_time$"GDP_Better"))+
scale_color_manual(values=c("forestgreen","red",'blue'))+
labs(x="Gold Medal Rank - GDP Adjusted", y="Normal Gold Medal Rank", title="GDP Adjusted Rank by Normal Rank",color='Rank Type')+
theme(legend.position="bottom")+
theme(plot.title=element_text(hjust = 0.5))+
geom_text_repel(aes(label = Country),
color = "gray20", size = 1.9)+
scale_x_continuous(limits=c(0,35))+
scale_y_continuous(limits=c(0,35))+
geom_abline(linetype = "dashed")+
annotate("text",x=3,y=33,label = c("Better Ranking \nAdjusted for GDP"),size = 4, color = "ForestGreen")+
annotate("text",x=33,y=3,label = c("Worse Ranking \nAdjusted for GDP"),size = 4, color = "Red")
The graph above shows that Russia, United States, China, Italy, etc (all Countries in Green) performed better after adjusting for GDP. The countries in red performed worse after adjusting for GDP.
ggplot(totals_all_time,aes(x=totals_all_time$"Pop Rank",y=totals_all_time$"Normal Rank"))+
theme_minimal()+
geom_point(aes(color=totals_all_time$"Pop_Better"))+
scale_color_manual(values=c("blue","forestgreen",'red'))+
labs(x="Gold Medal Rank - Pop Adjusted", y="Normal Gold Medal Rank", title="Pop Adjusted Rank by Normal Rank",subtitle = "Superpowers Fall",color='Rank Type')+
theme(legend.position="bottom")+
geom_text_repel(aes(label = Country),
color = "gray20", size = 1.9)+
scale_x_continuous(limits=c(0,35))+
scale_y_continuous(limits=c(0,35))+
geom_abline(linetype = "dashed")+
annotate("text",x=3,y=33,label = c("Better Ranking \nAdjusted for Pop"),size = 4, color = "ForestGreen")+
annotate("text",x=33,y=3,label = c("Worse Ranking \nAdjusted for Pop"),size = 4, color = "Red")
The graph above shows that some of the olympic super powers: Russia, United States, Germany, China, Canada, etc (all Countries in red) performed much worse after adjusting for population. The countries in green performed better after adjusting for Population.
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
winter_hosts<-hosts %>% filter(Winter != "", Year<='2016') %>%
rename(Host_City = City) %>%
rename(Host_Country = Country)%>%
select(Host_City, Host_Country, Year)
#winter_hosts
all_with_host <- all_data %>%
left_join(winter_hosts, by = 'Year')
all_with_host$host_nationality_flag[all_with_host$Host_Country == all_with_host$Country] <- "Host"
all_with_host$host_nationality_flag[all_with_host$Host_Country != all_with_host$Country] <- "Not_Host"
hosting_averages<- all_with_host %>%
filter(Medal %in% c("Gold",'Silver','Bronze'))%>%
filter(Country %in% winter_hosts$Host_Country) %>%
group_by(Country,host_nationality_flag) %>%
summarise(Average_Medals_Per_Olympics = n_distinct(Games, Year, Season, Medal, Event, Country)/ n_distinct(Games,Year,Country))%>%
#spread(key = host_nationality_flag, value = Medal_Count) %>%
arrange(Country)
data<-hosting_averages%>%
spread(key = host_nationality_flag, value = Average_Medals_Per_Olympics) %>%
arrange(Not_Host)
data$Difference <- round(data$Host - data$Not_Host,2)
data$Positive_Color <- case_when(data$Difference < 0 ~ "#EF2A2A",
data$Difference >0 ~"#38CF4C")
data$Positive <- case_when(data$Difference < 0 ~ 0,
data$Difference >0 ~ 1)
hosting_averages<- hosting_averages %>%
left_join(data, by = 'Country')
#hosting_averages
#data
plot2<-ggplot() +
theme_minimal() +
geom_point(data=hosting_averages,
aes(y=reorder(Country,Difference)
, x=Average_Medals_Per_Olympics, fill=host_nationality_flag),
size=6, shape=21, color="grey30") +
geom_segment(data=data,
aes(y=fct_reorder2(Country,Not_Host,Positive), yend=fct_reorder2(Country,Not_Host,Positive)
, x=Not_Host, xend=Host,color=Positive_Color),
size=1.8,
lineend="butt", linejoin="mitre",
arrow=arrow(length = unit(0.01, "npc"), type="closed")) +
scale_color_identity() +
scale_fill_manual(values=c("gold","grey"),labels = c("Host", "Not Host"))+
labs(x="Average Medals Per Olympics", y=NULL, title="Average Medals per Olympics by Hosting Status",fill='Hosting Status')+
geom_rect(data, mapping = aes(xmin=21, xmax=25, ymin=-Inf, ymax=Inf),fill="light blue") +
geom_text(data, mapping = aes(label=Difference, y=Country, x=23), size=3) +
geom_text(data=filter(data, Country=="Canada"),
mapping = aes(x=23, y=Country, label="Difference"),
size=3.1, vjust=-2, fontface="bold") +
scale_x_continuous(limits=c(0, 25)) +
scale_y_discrete(expand=c(0.14,0))
plot2
Of the countries who have hosted, 6 of the countries have a higher average medals per Olympics hosted than not hosted. Some countries appear to have more of an advantage when they hosted. For example, Only 3 seem to have a disadvantage when hosting: Austria, Switzerland, and Germany.
#all_data
successful<-all_data %>%
filter(Medal %in% c("Gold",'Silver','Bronze'))%>%
group_by(ID, Name, Sex) %>%
summarise(Medal_Count = n_distinct(Sport,Event,Sex,Medal,Year)) %>%
arrange(desc(Medal_Count))%>%
head(17)
successful
## # A tibble: 17 × 4
## # Groups: ID, Name [17]
## ID Name Sex Medal_Count
## <int> <chr> <chr> <int>
## 1 11951 "Ole Einar Bjrndalen" M 13
## 2 9747 "Stefania Belmondo" F 10
## 3 11943 "Marit Bjrgen" F 10
## 4 112111 "Raisa Petrovna Smetanina" F 10
## 5 28751 "Ursula \"Uschi\" Disl" F 9
## 6 54647 "Edy Sixten Jernberg" M 9
## 7 92566 "Claudia Pechstein" F 9
## 8 132791 "Lyubov Ivanovna Yegorova" F 9
## 9 20 "Kjetil Andr Aamodt" M 8
## 10 3604 "Viktor An" M 8
## 11 32700 "Karin Enke-Kania (-Busch-, -Richter)" F 8
## 12 35539 "Sven Fischer" M 8
## 13 43154 "Ricco Gro" M 8
## 14 64799 "Galina Alekseyevna Kulakova" F 8
## 15 86067 "Gunda Niemann-Stirnemann-Kleemann" F 8
## 16 88298 "Apolo Anton Ohno" M 8
## 17 131897 "Irene Karlijn \"Ireen\" Wst" F 8
ggplot(data=successful
, aes(x=Medal_Count, y=reorder(reorder(Name,Sex), Medal_Count, sum), fill=factor(Sex))) +
theme_minimal()+
labs(fill = "Sex") +
labs(x="Medal Count", y="Athlete", title="Top 17 Most Successful Athletes") +
geom_bar(stat="identity") +
scale_fill_manual(values=c('maroon','sky blue'))+
theme(plot.title=element_text(hjust = 0.5))
The chart above shows the total medals earned by the 17 most successful winter athletes by the total numbers of medals won.
athlete_dimensions <- all_data %>%
filter(Medal %in% c("Gold",'Silver','Bronze'),not(is.na(Height)),not(is.na(Weight))) %>%
distinct(Name, Sport,Sex,Height, Weight)
ave_dimensions<- athlete_dimensions %>%
group_by(Sport,Sex) %>%
summarise(Avg_Height = mean(Height)
, Ave_Weight = mean(Weight))
ggplot(data=ave_dimensions
, aes(x=Avg_Height, y=Ave_Weight, color =Sex)) +
theme_minimal()+
labs(x="Average Height(cm)", y="Average Weight", title="Average Height and Weight of Medal Winners per Sport") +
geom_point() +
theme(plot.title=element_text(hjust = 0.5))+
geom_text_repel(aes(label = Sport),
color = "gray20", size = 1.9)
The graph above displays the average height and weight of Olympic athletes who won medals. The graph is also grouped by Sex.
ggplotly(plot1)
I like the graph above a little bit more when it is interactive. A user can click on the countries to tell which is which.
athlete_dimensions <- all_data %>%
filter(Medal %in% c("Gold",'Silver','Bronze'),not(is.na(Height)),not(is.na(Weight))) %>%
distinct(Name, Sport,Sex, Height, Weight)
ave_dimensions<- athlete_dimensions %>%
group_by(Sport,Sex) %>%
summarise(Avg_Height = mean(Height)
, Ave_Weight = mean(Weight))
fig = plot_ly(ave_dimensions, x=~Avg_Height, y=~Ave_Weight
, color=~Sex,colors = c("red", "blue")
, type = "scatter", mode='markers'
, text = ~paste('Sport: ', Sport)) %>% layout(title = 'Average Height and Weight of Medal Winners per Sport')
fig
I like this version of the graph because in the static version, some of the labels were hard to tell which point they belonged to. In this version, the user can hover on the point and see the exact statistics as well as the exact sport the point belongs to.
table_data<- all_data %>%
filter(Season == "Winter", Medal %in% c("Gold",'Silver','Bronze'), not(is.na(GDP.per.Capita)))%>%
group_by(Country,Medal,GDP.per.Capita,Population) %>%
summarise(Medal_Count = n_distinct(Games, Year, Season, Medal, Event, Country),
Athlete_Count = n_distinct(Name)
) %>%
arrange((Country))
datatable(totals_all_time, rownames = FALSE,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:")))